"""
The code is released exclusively for review purposes with the following terms:
PROPRIETARY AND CONFIDENTIAL. UNAUTHORIZED USE, COPYING, OR DISTRIBUTION OF THE 
CODE, VIA ANY MEDIUM, IS STRICTLY PROHIBITED. BY ACCESSING THE CODE, THE 
REVIEWERS AGREE TO DELETE THEM FROM ALL MEDIA AFTER THE REVIEW PERIOD IS OVER.
"""

import os
import pandas as pd
from sklearn.datasets import fetch_openml

def default_preprocessing(df):
    """
    Drop incomplete rows and convert categorical columns to numeric values.

    The input container is left untouched: filtering and dtype conversion are
    performed on a private copy of the feature frame, so the caller keeps the
    raw rows and the original categorical dtypes.

    Args:
        df: Mapping-like object (e.g. the bunch returned by
            ``fetch_openml(..., as_frame=True)``) exposing a feature
            DataFrame under ``"data"`` and a target Series under ``"target"``.

    Returns:
        Tuple ``(X, y, feature_names, target_name, categorical_feature_inds)``:
        ``X`` and ``y`` are the ``.values`` of the filtered features and
        target, ``feature_names`` is the list of feature column names,
        ``target_name`` is the fixed label ``"MPG"``, and
        ``categorical_feature_inds`` holds the positions of the columns whose
        dtype was ``category`` before conversion.
    """
    features = df["data"]
    target = df["target"]

    # Keep only rows with no missing value in any feature nor in the target.
    complete_rows = features.notna().all(axis=1) & pd.notna(target)

    # Explicit copy: the column conversion below must neither write through
    # to the caller's frame nor trigger pandas' SettingWithCopyWarning.
    features = features.loc[complete_rows].copy()
    target = target.loc[complete_rows]

    # Feature and target names
    feature_names = list(features.columns)
    target_name = "MPG"

    # Record which columns are categorical, then make them numeric so that
    # the returned array contains numbers only.
    categorical_feature_inds = []
    for idx, name in enumerate(feature_names):
        if features[name].dtype.name == "category":
            categorical_feature_inds.append(idx)
            features[name] = pd.to_numeric(features[name])

    X = features.values
    y = target.values

    return X, y, feature_names, target_name, categorical_feature_inds

class AutoMPGDataset():
    """
    Dataset describing relationship between various input variables and mileage 
    (in miles per gallon) of automobiles. This class uses Scikit-Learn Open ML 
    API to load the data.

    The dataset is fetched once, at construction time, and the (optionally
    preprocessed) result is exposed through :meth:`data`.

    References:
        .. [#] `Auto MPG Data Set <https://archive.ics.uci.edu/ml/datasets/Auto+MPG/>`_
    """

    def __init__(self, custom_preprocessing=default_preprocessing):
        """
        Fetch the ``autoMpg`` Open ML dataset and preprocess it.

        Args:
            custom_preprocessing: Callable applied to the fetched bunch; its
                return value is what :meth:`data` returns. When falsy (e.g.
                ``None``), no preprocessing is applied and the raw fetched
                bunch is stored instead.

        Note:
            If ``fetch_openml`` raises ``ValueError``, the error is printed
            and the process exits with status 1.
        """

        # load data
        try:
            df = fetch_openml(name="autoMpg", as_frame=True)
        except ValueError as err:
            print("ValueError: {}".format(err))
            print("To use this class, please check the name of the Open ML dataset:")
            import sys
            sys.exit(1)

        if custom_preprocessing:
            self._data = custom_preprocessing(df)
        else:
            # Without a preprocessing step, keep the raw bunch so that
            # data() still has something to return instead of raising
            # AttributeError on the missing attribute.
            self._data = df

    def data(self):
        """Return the stored dataset (preprocessed output, or the raw bunch)."""
        return self._data
